{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#特征结构\n",
    "kim={'CAT':'NP', 'ORTH':'Kim', 'REF':'k'}#CAT:文法类别，orth：拼写\n",
    "chase={'CAT':'V', 'ORTH':'chased', 'REL':'chase'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "k\n",
      "chase\n"
     ]
    }
   ],
   "source": [
    "#还有一个面向语意的特征\n",
    "print(kim['REF'])#kim['REF']表示Kim的指示物\n",
    "print(chase['REL'])#chase['REL']表示chase表示的关系"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'AGT': 'sbj', 'CAT': 'V', 'ORTH': 'chased', 'PAT': 'obj', 'REL': 'chase'}"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#特征结构包含各种有关文法的实体信息，这些信息不需要详尽介绍，\n",
    "#而是要进一步添加属性，先用sbj和obj作为占位符\n",
    "chase['AGT']='sbj'#agent(施事)\n",
    "chase['PAT']='obj'#patient（受事）\n",
    "chase"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ORTH => chased\n",
      "REL => chase\n",
      "AGT => k\n",
      "PAT => l\n"
     ]
    }
   ],
   "source": [
    "#动词与文法参数结合时，占位符会被填充\n",
    "sent='Kim chased Lee'\n",
    "tokens=sent.split()\n",
    "lee={'CAT':'NP','ORTH':'Lee','REF':'l'}\n",
    "def lex2fs(word):\n",
    "    for fs in [kim,lee,chase]:\n",
    "        if fs['ORTH']==word:\n",
    "            return fs\n",
    "subj,verb,obj=lex2fs(tokens[0]),lex2fs(tokens[1]),lex2fs(tokens[2])\n",
    "verb['AGT']=subj['REF']#chase 的agent(施事)者是tim\n",
    "verb['PAT']=obj['REF']#chase的patient（受事）者是lee\n",
    "for k in ['ORTH','REL','AGT','PAT']:#查看chase的特征结构\n",
    "    print(k,'=>',verb[k])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#使用同样方法应用于不同动词————say和surprise#\n",
    "#不过主语会扮演source（源事SRC）的角色，宾语扮演experiencer（体验者EXP）的角色\n",
    "surprise={'SRC': 'sbj', 'CAT': 'V', 'ORTH': 'surprised', 'EXP': 'obj', 'REL': 'surprise'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#显示无关上下文文法和分析如何能扩展到合适的特征结构，用更通用的方式建立这样的分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#句法协议\n",
    "(1) a. this dog\n",
    "    b. *these dog\n",
    "(2) a. these dogs\n",
    "    b. *this dogs\n",
    "(3) a. the dog runs\n",
    "    b. *the dog run\n",
    "(4) a. the dogs run\n",
    "    b. *the dogs runs\n",
    "#动词的形态属性同主语名词短语的句法属性一起变化，该过程称为协议\n",
    "#插图\n",
    "\n",
    "\n",
    "(5) a. the dog        run-s\n",
    "    b.     dog.3.SG   run-3.SG\n",
    "(6) a. the dog-s      run\n",
    "    b.     dog.3.PL   run-3.PL\n",
    "    \n",
    "#\n",
    "(7) s   -> NP VP\n",
    "    NP  -> Det N\n",
    "    VP  -> V\n",
    "    Det -> 'this'\n",
    "    N   -> 'dog'\n",
    "    V   -> 'runs'\n",
    "    \n",
    "#为了得到---> these dogs run,最简单的方法是为文法添加新的非终结符和产生式\n",
    "(8) s -> NP_SG VP_SG\n",
    "    s -> NP_PL VP_PL\n",
    "    NP_SG -> Det_SG N_SG\n",
    "    NP_PL -> Det_PL N_PL\n",
    "    \n",
    "    Det_SG -> 'this'\n",
    "    Det_PL -> 'these'\n",
    "    N_SG   -> 'dog'\n",
    "    N_PL   -> 'dogs'\n",
    "    V_SG   -> 'runs'\n",
    "    V_PL   -> 'run'\n",
    "#除了唯一扩展S的产生式，还有两个产生式，一个是单数主语NP和VP的句子，另一个是包含复数主语NP和VP的句子\n",
    "#图中每个产生式在（8）中都有两个对应项，在涵盖了一定量英语成分子集的文法中，会产生爆炸式增长\n",
    "\n",
    "#以下将展示数量和人称协议的一致性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#使用属性和约束\n",
    "(9) N[NUM=pl]#N有一个特征叫做NUM，此特征的值是pl\n",
    "\n",
    "#例子\n",
    "(10)Det[NUM=sg]-> 'this'\n",
    "    Det[NUM=pl]-> 'these'\n",
    "    N[NUM=sg]  -> 'dog'\n",
    "    N[NUM=pl]  -> 'dogs'\n",
    "    V[NUM=sg]  -> 'runs'\n",
    "    V[NUM=pl]  -> 'run'\n",
    "    \n",
    "    \n",
    "#允许使用特征值变量表示限制\n",
    "(11)S -> NP[NUM=?n]VP[NUM=?n]\n",
    "    NP[NUM=?n]-> Det[NUM=?n]N[NUM=?n]\n",
    "    VP[NUM=?n]-> V[NUM=?n]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#图\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#有其他限定词不限制名词的文法数量，描述方法是在文法中添加两个词汇条目\n",
    "Det[NUM=sg]-> 'the' | 'some' | 'several'\n",
    "Det[NUM=pl]-> 'the' | 'some' | 'several'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#更好的方法：不限制NUM的值，让其保存相关词汇的数量值，为NUM设定一个变量\n",
    "Det[NUM=?n]-> 'the' | 'some' | 'several'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "% start S\n",
      "# ###################\n",
      "# Grammar Productions\n",
      "# ###################\n",
      "# S expansion productions\n",
      "S -> NP[NUM=?n] VP[NUM=?n]\n",
      "# NP expansion productions\n",
      "NP[NUM=?n] -> N[NUM=?n] \n",
      "NP[NUM=?n] -> PropN[NUM=?n] \n",
      "NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n]\n",
      "NP[NUM=pl] -> N[NUM=pl] \n",
      "# VP expansion productions\n",
      "VP[TENSE=?t, NUM=?n] -> IV[TENSE=?t, NUM=?n]\n",
      "VP[TENSE=?t, NUM=?n] -> TV[TENSE=?t, NUM=?n] NP\n",
      "# ###################\n",
      "# Lexical Productions\n",
      "# ###################\n",
      "Det[NUM=sg] -> 'this' | 'every'\n",
      "Det[NUM=pl] -> 'these' | 'all'\n",
      "Det -> 'the' | 'some' | 'several'\n",
      "PropN[NUM=sg]-> 'Kim' | 'Jody'\n",
      "N[NUM=sg] -> 'dog' | 'girl' | 'car' | 'child'\n",
      "N[NUM=pl] -> 'dogs' | 'girls' | 'cars' | 'children' \n",
      "IV[TENSE=pres,  NUM=sg] -> 'disappears' | 'walks'\n",
      "TV[TENSE=pres, NUM=sg] -> 'sees' | 'likes'\n",
      "IV[TENSE=pres,  NUM=pl] -> 'disappear' | 'walk'\n",
      "TV[TENSE=pres, NUM=pl] -> 'see' | 'like'\n",
      "IV[TENSE=past] -> 'disappeared' | 'walked'\n",
      "TV[TENSE=past] -> 'saw' | 'liked'\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "nltk.data.show_cfg('book_grammars/feat0.fcfg')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#一个句法类别可以有多个特征，比如V[TENSE=pres,NUM=pl],一般倾向于添加特征\n",
    "# start S 表示以S作为文法的开始符号"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "|.Kim .like.chil.|\n",
      "Leaf Init Rule:\n",
      "|[----]    .    .| [0:1] 'Kim'\n",
      "|.    [----]    .| [1:2] 'likes'\n",
      "|.    .    [----]| [2:3] 'children'\n",
      "Feature Bottom Up Predict Combine Rule:\n",
      "|[----]    .    .| [0:1] PropN[NUM='sg'] -> 'Kim' *\n",
      "Feature Bottom Up Predict Combine Rule:\n",
      "|[----]    .    .| [0:1] NP[NUM='sg'] -> PropN[NUM='sg'] *\n",
      "Feature Bottom Up Predict Combine Rule:\n",
      "|[---->    .    .| [0:1] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'sg'}\n",
      "Feature Bottom Up Predict Combine Rule:\n",
      "|.    [----]    .| [1:2] TV[NUM='sg', TENSE='pres'] -> 'likes' *\n",
      "Feature Bottom Up Predict Combine Rule:\n",
      "|.    [---->    .| [1:2] VP[NUM=?n, TENSE=?t] -> TV[NUM=?n, TENSE=?t] * NP[] {?n: 'sg', ?t: 'pres'}\n",
      "Feature Bottom Up Predict Combine Rule:\n",
      "|.    .    [----]| [2:3] N[NUM='pl'] -> 'children' *\n",
      "Feature Bottom Up Predict Combine Rule:\n",
      "|.    .    [----]| [2:3] NP[NUM='pl'] -> N[NUM='pl'] *\n",
      "Feature Bottom Up Predict Combine Rule:\n",
      "|.    .    [---->| [2:3] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'pl'}\n",
      "Feature Single Edge Fundamental Rule:\n",
      "|.    [---------]| [1:3] VP[NUM='sg', TENSE='pres'] -> TV[NUM='sg', TENSE='pres'] NP[] *\n",
      "Feature Single Edge Fundamental Rule:\n",
      "|[==============]| [0:3] S[] -> NP[NUM='sg'] VP[NUM='sg'] *\n",
      "(S[]\n",
      "  (NP[NUM='sg'] (PropN[NUM='sg'] Kim))\n",
      "  (VP[NUM='sg', TENSE='pres']\n",
      "    (TV[NUM='sg', TENSE='pres'] likes)\n",
      "    (NP[NUM='pl'] (N[NUM='pl'] children))))\n"
     ]
    }
   ],
   "source": [
    "#NLTK中使用Earley图表分析器分析基于特征的文法，输出tree的数量取决于是否有句法歧义\n",
    "tokens = 'Kim likes children'.split()\n",
    "from nltk import load_parser \n",
    "cp = load_parser('book_grammars/feat0.fcfg',trace=2) \n",
    "for tree in cp.parse(tokens):\n",
    "    print(tree)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#术语 \n",
    "sg,pl 这样的简单值通常被称作原子，原子值的一种特殊情况是布尔值\n",
    "AUX就是一个典型的布尔特征，标记是否为助动词\n",
    "\n",
    "V[TENSE=pres,+aux]  ->'can'\n",
    "V[TENSE=pres,-aux=+]->'walks'\n",
    "\n",
    "#除了原子值特征，特征可能本身就是特征结构的值，比如：可以将协议特征组合在一起（比如人称，数量和性别）作为一个类别的不同部分，表示为AGR\n",
    "\n",
    "#属性矩阵\n",
    "\n",
    "[POS = N           ]\n",
    "[                  ]\n",
    "[AGR = [PER = 3   ]]\n",
    "[      [NUM = pl  ]]\n",
    "[      [GND = fem ]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "AGR示例\n",
    "S->NP[AGR=?n] VP[AGR=?n]\n",
    "NP[AGR=?n]->PropN[AGR=?n]\n",
    "VP[TENSE=?t,AGR=?n]->Cop[TENSE=?t,AGR=?n] Adj\n",
    "\n",
    "Cop[TENSE=pres,AGR=[NUM=sg,PER=3]] -> 'is'\n",
    "PropN[AGR=[NUM=sg,PER=3]]->'Kim'\n",
    "Adj->'happy'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "9.2处理特征结构"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ NUM   = 'sg'   ]\n",
      "[ TENSE = 'past' ]\n"
     ]
    }
   ],
   "source": [
    "fsl=nltk.FeatStruct(TENSE='past',NUM='sg')\n",
    "print(fsl)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ CASE  = 'acc'  ]\n",
      "[ NUM   = 'sg'   ]\n",
      "[ TENSE = 'past' ]\n"
     ]
    }
   ],
   "source": [
    "#给其赋值\n",
    "fsl['CASE']='acc'\n",
    "print(fsl)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[       [ CASE  = 'acc'  ] ]\n",
      "[ AGR = [ NUM   = 'sg'   ] ]\n",
      "[       [ TENSE = 'past' ] ]\n",
      "[                          ]\n",
      "[ POS = 'N'                ]\n"
     ]
    }
   ],
   "source": [
    "fs2=nltk.FeatStruct(POS='N',AGR=fsl)\n",
    "print(fs2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ age   = '33'             ]\n",
      "[ name  = 'Lee'            ]\n",
      "[ telnp = '01 27 86 42 96' ]\n"
     ]
    }
   ],
   "source": [
    "#特征结构不依赖于语言对象，特征结构是表示知识的通用目的的结构\n",
    "print(nltk.FeatStruct(name='Lee',telnp='01 27 86 42 96',age='33'))#表示人物信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#可以将特征结构看作是图，具体来说，是有向无环图\n",
    "（21）图\n",
    "（22）图#特征值可以是复杂的\n",
    "#标签的元祖表示路径，因此（address，street）就是一个特征路径，表示标签为‘reu Pascal’节点\n",
    "（23）图#情况\n",
    "（24）图#address和（spouse，address）的值相同，这种情况被称为结构共享或者重入，当两条路径具有相同值时，它们被称为是等价的"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ ADDRESS = (1) [ NUMBER = 74           ] ]\n",
      "[               [ STREET = 'rue Pascal' ] ]\n",
      "[                                         ]\n",
      "[ NAME    = 'Lee'                         ]\n",
      "[                                         ]\n",
      "[ SPOUSE  = [ ADDRESS -> (1)  ]           ]\n",
      "[           [ NAME    = 'Kim' ]           ]\n"
     ]
    }
   ],
   "source": [
    "#为了表示重入，在共享的特征结构第一次出现的地方加一个括号括起的数字前缀比如（1）\n",
    "print( nltk.FeatStruct(\"[NAME='Lee', ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'],SPOUSE=[NAME='Kim', ADDRESS->(1)]]\"))\n",
    "#括号内的整数被称为标记或者是同指标记"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "包含和统一"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#特征结构提供一些对象的部分信息，可以根据特征结构的通用程度给其进行排序\n",
    "a.  [NUMBER = 74]#更一般的\n",
    "b.  [NUMBER = 74 ]\n",
    "    [STREET = 'rue Pascal']\n",
    "c.  [NUMBER = 74 ]\n",
    "    [STREET = 'rue Pascal']\n",
    "    [CITY = 'Paris' ]\n",
    "#这个顺序称为包含，一个更一般的结构包含一个较一般的\n",
    "（23）包含（24（等价路径那个图））是因为后者有额外的等价路径"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ CITY   = 'Paris'      ]\n",
      "[ NUMBER = 74           ]\n",
      "[ STREET = 'rue Pascal' ]\n"
     ]
    }
   ],
   "source": [
    "#一些特征结构比其他的更具体，如何去具体化一个给定的特征结构\n",
    "#图（27）\n",
    "#合并两个特征结构的信息被称为统一使用方法unify\n",
    "fs1 = nltk.FeatStruct(NUMBER=74, STREET='rue Pascal')\n",
    "fs2 = nltk.FeatStruct(CITY='Paris')\n",
    "print (fs1.unify(fs2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ CITY   = 'Paris'      ]\n",
      "[ NUMBER = 74           ]\n",
      "[ STREET = 'rue Pascal' ]\n"
     ]
    }
   ],
   "source": [
    "#统一是二元操作\n",
    "print (fs2.unify(fs1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "None\n"
     ]
    }
   ],
   "source": [
    "#统一的特征原子值不同时，设置统一的结果为None\n",
    "fs0 = nltk.FeatStruct(A='a')\n",
    "fs1 = nltk.FeatStruct(A='b')\n",
    "fs2 = fs0.unify(fs1)\n",
    "print (fs2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ ADDRESS = [ NUMBER = 74           ]               ]\n",
      "[           [ STREET = 'rue Pascal' ]               ]\n",
      "[                                                   ]\n",
      "[ NAME    = 'Lee'                                   ]\n",
      "[                                                   ]\n",
      "[           [ ADDRESS = [ NUMBER = 74           ] ] ]\n",
      "[ SPOUSE  = [           [ STREET = 'rue Pascal' ] ] ]\n",
      "[           [                                     ] ]\n",
      "[           [ NAME    = 'Kim'                     ] ]\n"
     ]
    }
   ],
   "source": [
    "#统一与包含的相互作用\n",
    "fs0 = nltk.FeatStruct(\"[NAME=Lee,ADDRESS=[NUMBER=74,STREET='rue Pascal'], SPOUSE= [NAME=Kim, ADDRESS=[NUMBER=74, STREET='rue Pascal']]]\")\n",
    "print (fs0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ ADDRESS = [ NUMBER = 74           ]               ]\n",
      "[           [ STREET = 'rue Pascal' ]               ]\n",
      "[                                                   ]\n",
      "[ NAME    = 'Lee'                                   ]\n",
      "[                                                   ]\n",
      "[           [           [ CITY   = 'Paris'      ] ] ]\n",
      "[           [ ADDRESS = [ NUMBER = 74           ] ] ]\n",
      "[ SPOUSE  = [           [ STREET = 'rue Pascal' ] ] ]\n",
      "[           [                                     ] ]\n",
      "[           [ NAME    = 'Kim'                     ] ]\n"
     ]
    }
   ],
   "source": [
    "fs1 = nltk.FeatStruct(\"[SPOUSE = [ADDRESS = [CITY = Paris]]]\")\n",
    "print (fs1.unify(fs0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ ADDRESS = (1) [ NUMBER = 74           ] ]\n",
      "[               [ STREET = 'rue Pascal' ] ]\n",
      "[                                         ]\n",
      "[ NAME    = 'Lee'                         ]\n",
      "[                                         ]\n",
      "[ SPOUSE  = [ ADDRESS -> (1)  ]           ]\n",
      "[           [ NAME    = 'Kim' ]           ]\n"
     ]
    }
   ],
   "source": [
    "fs2 = nltk.FeatStruct(\"[NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'],SPOUSE=[NAME=Kim, ADDRESS->(1)]]\")\n",
    "print(fs2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[               [ CITY   = 'Paris'      ] ]\n",
      "[ ADDRESS = (1) [ NUMBER = 74           ] ]\n",
      "[               [ STREET = 'rue Pascal' ] ]\n",
      "[                                         ]\n",
      "[ NAME    = 'Lee'                         ]\n",
      "[                                         ]\n",
      "[ SPOUSE  = [ ADDRESS -> (1)  ]           ]\n",
      "[           [ NAME    = 'Kim' ]           ]\n"
     ]
    }
   ],
   "source": [
    "print(fs1.unify(fs2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "同时更新两个地址"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#结构共享可以使用变量表示 比如?x\n",
    "fs1 = nltk.FeatStruct(\"[ADDRESS1=[NUMBER=74, STREET='rue Pascal']]\")\n",
    "fs2 = nltk.FeatStruct(\"[ADDRESS1=?x, ADDRESS2=?x]\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ ADDRESS1 = ?x ]\n",
      "[ ADDRESS2 = ?x ]\n",
      "[ ADDRESS1 = (1) [ NUMBER = 74           ] ]\n",
      "[                [ STREET = 'rue Pascal' ] ]\n",
      "[                                          ]\n",
      "[ ADDRESS2 -> (1)                          ]\n"
     ]
    }
   ],
   "source": [
    "print(fs2)\n",
    "print (fs2.unify(fs1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "9.3扩展基于特征的文法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#自列别\n",
    "\n",
    "VP -> IV\n",
    "VP -> TV NP\n",
    "\n",
    "#虽然我们知道 IV 和 TV 是两种 V，它们只是一个 原子非终结符\n",
    "\n",
    "#这个符号不会告诉我们任何有关一般动词的信息，比如不能说  V 类的所有词汇都按时态标记\n",
    "#通过允许词汇类别支持子类别特征\n",
    "VP[TENSE=?t, NUM=?n] -> V[SUBCAT=intrans, TENSE=?t, NUM=?n]\n",
    "VP[TENSE=?t, NUM=?n] -> V[SUBCAT=trans, TENSE=?t, NUM=?n] NP\n",
    "VP[TENSE=?t, NUM=?n] -> V[SUBCAT=clause, TENSE=?t, NUM=?n] SBar\n",
    "\n",
    "V[SUBCAT=intrans, TENSE=pres, NUM=sg] -> 'disappears' | 'walks'\n",
    "V[SUBCAT=trans, TENSE=pres, NUM=sg] -> 'sees' | 'likes'\n",
    "V[SUBCAT=clause, TENSE=pres, NUM=sg] -> 'says' | 'claims'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#从句的产生式分析\n",
    "SBar -> Comp S\n",
    "Comp -> 'that'\n",
    "#图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#现在有一句话是put the book on the table，可以表示为\n",
    "V[SUBCAT=<NP,NP,PP>] #NP表示主语，后面的PP和NP是补语子类别\n",
    "\n",
    "\n",
    "图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "V[SUBCAT=<NP>]#一个从句成为一种不需要更多参数的动词类别\n",
    "\n",
    "\n",
    "#图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "核心词回顾\n",
    "V类的表达式是VP类的短语的核心\n",
    "N是NP的核心，A是AP的核心\n",
    "并非所有的词都带有核心词，现在希望能通过文法形式表达它持有的父母/核心子女关系\n",
    "X-bar 句法通过抽象出 短语级别的概念，解决了这个问题\n",
    "\n",
    "\n",
    "如果 N 表示的词汇级别，那么 N’表示更高一层级别\n",
    "N’’表示短语级别，对应类别 NP\n",
    "图\n",
    "核心词是 N\n",
    "而 N'和 N''被称为 N 的 （ 短语的 ） 投影。N''是 最大的投影 ，\n",
    "N 有时也被称为 零投影\n",
    "X-bar 文法一个中心思想是所有成分都有结构的类似性\n",
    "我们说一个词汇核心 X 的直接补语子类别总是位于核心词的兄\n",
    "弟的位置，而修饰成分位于中间类别 X'的兄弟的位置\n",
    "图37\n",
    "\n",
    "\n",
    "S -> N[BAR=2] V[BAR=2]\n",
    "N[BAR=2] -> Det N[BAR=1]\n",
    "N[BAR=1] -> N[BAR=1] P[BAR=2]\n",
    "N[BAR=1] -> N[BAR=0] P[BAR=2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "助动词与倒装\n",
    "(39) a. Do you like children?\n",
    "     b. Can Jody walk?\n",
    "(40) a. Rarely do you see Kim.\n",
    "     b. Never have I seen this dog\n",
    "然而，我们不能仅仅把任意动词放在主语前面：\n",
    "(41) a. *Like you children?\n",
    "     b. *Walks Jody?\n",
    "(42) a. *Rarely see you Kim.\n",
    "     b. *Never saw I this dog.\n",
    "S[+INV] -> V[+AUX] NP VP\n",
    "标记有[+inv]的从句包含一个助动词，其后跟着一个 VP\n",
    "图44"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "无限制依赖成分\n",
    "\n",
    "\n",
    "(45) a. You like Jody.\n",
    "     b. *You like.\n",
    "(46) a. You put the card into the slot.\n",
    "     b. *You put into the slot.\n",
    "     c. *You put the card.\n",
    "     d. *You put.\n",
    "    \n",
    "动词 like 需要一个 NP 补语，而 put 需要一个跟随其后的 NP 和 PP 。（45）和（46）表\n",
    "明，这些补语是必须的：省略它们会导致不符合语法。\n",
    "\n",
    "然而，有些上下文中强制性的补语可以省略\n",
    "\n",
    "\n",
    "(47) a. Kim knows who you like.\n",
    "     b. This music, you really like.\n",
    "(48) a. Which card do you put into the slot?\n",
    "     b. Which slot do you put the card into?\n",
    "如果句子中有是适当的填充，一个强制性补语可以被省略\n",
    "（48）中的 wh 短语 which card/slot。\n",
    "通常说类似(47)和(48)中的句子包含一个 缺口，在那里强制性补语被省略了\n",
    "\n",
    "\n",
    "(49) a. Which card do you put __ into the slot?\n",
    "     b. Which slot do you put the card into __?\n",
    "句子中的依赖关系的距离没有上界，因为可以无限加深句子补语的递归\n",
    "\n",
    "(52) a. Who do you like __?\n",
    "     b. Who do you claim that you like __?\n",
    "     c. Who do you claim that Jody says that you like __?\n",
    "        \n",
    "Y/XP；我们解释为：类别 Y 的短语缺少一个类别 XP 的子成分\n",
    "图53\n",
    "\n",
    "树的顶端部分引入了填充词 who（作为 NP[+wh]类表达式对待）和相应的包含成分 S/NP 的缺口一起\n",
    "缺口信息于是被向着树的下方通过 VP/NP 类别“预填充”\n",
    "\n",
    "我们可以将它们纳入我们现有的基于特征的框架\n",
    "通过将斜线作为一个特征，它右边的类别作为值；也就是说，S/NP 可变为 S[SLASH=NP]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "% start S\n",
      "# ###################\n",
      "# Grammar Productions\n",
      "# ###################\n",
      "S[-INV] -> NP VP\n",
      "S[-INV]/?x -> NP VP/?x\n",
      "S[-INV] -> NP S/NP\n",
      "S[-INV] -> Adv[+NEG] S[+INV]\n",
      "S[+INV] -> V[+AUX] NP VP\n",
      "S[+INV]/?x -> V[+AUX] NP VP/?x\n",
      "SBar -> Comp S[-INV]\n",
      "SBar/?x -> Comp S[-INV]/?x\n",
      "VP -> V[SUBCAT=intrans, -AUX]\n",
      "VP -> V[SUBCAT=trans, -AUX] NP\n",
      "VP/?x -> V[SUBCAT=trans, -AUX] NP/?x\n",
      "VP -> V[SUBCAT=clause, -AUX] SBar\n",
      "VP/?x -> V[SUBCAT=clause, -AUX] SBar/?x\n",
      "VP -> V[+AUX] VP\n",
      "VP/?x -> V[+AUX] VP/?x\n",
      "# ###################\n",
      "# Lexical Productions\n",
      "# ###################\n",
      "V[SUBCAT=intrans, -AUX] -> 'walk' | 'sing'\n",
      "V[SUBCAT=trans, -AUX] -> 'see' | 'like'\n",
      "V[SUBCAT=clause, -AUX] -> 'say' | 'claim'\n",
      "V[+AUX] -> 'do' | 'can'\n",
      "NP[-WH] -> 'you' | 'cats'\n",
      "NP[+WH] -> 'who'\n",
      "Adv[+NEG] -> 'rarely' | 'never'\n",
      "NP/NP ->\n",
      "Comp -> 'that'\n"
     ]
    }
   ],
   "source": [
    "#例 9.3\n",
    "nltk.data.show_cfg('book_grammars/feat1.fcfg')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "包含一个“缺口引进”产生式，即 S[-INV] -> NP S/NP\n",
    "可以为一个成分的父母 VP 指定斜线值，只要也为孩子 SBar 指定同样的值\n",
    "NP/NP ->         允许 NP上的斜线信息为空字符串"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(S[-INV]\n",
      "  (NP[+WH] who)\n",
      "  (S[+INV]/NP[]\n",
      "    (V[+AUX] do)\n",
      "    (NP[-WH] you)\n",
      "    (VP[]/NP[]\n",
      "      (V[-AUX, SUBCAT='clause'] claim)\n",
      "      (SBar[]/NP[]\n",
      "        (Comp[] that)\n",
      "        (S[-INV]/NP[]\n",
      "          (NP[-WH] you)\n",
      "          (VP[]/NP[] (V[-AUX, SUBCAT='trans'] like) (NP[]/NP[] )))))))\n"
     ]
    }
   ],
   "source": [
    "tokens = 'who do you claim that you like'.split()\n",
    "from nltk import load_parser\n",
    "cp = load_parser('book_grammars/feat1.fcfg')\n",
    "for tree in cp.parse(tokens):\n",
    "    print(tree)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(S[-INV]\n",
      "  (NP[-WH] you)\n",
      "  (VP[]\n",
      "    (V[-AUX, SUBCAT='clause'] claim)\n",
      "    (SBar[]\n",
      "      (Comp[] that)\n",
      "      (S[-INV]\n",
      "        (NP[-WH] you)\n",
      "        (VP[] (V[-AUX, SUBCAT='trans'] like) (NP[-WH] cats))))))\n"
     ]
    }
   ],
   "source": [
    "tokens ='you claim that you like cats'.split()\n",
    "for tree in cp.parse(tokens):\n",
    "    print(tree)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(S[-INV]\n",
      "  (Adv[+NEG] rarely)\n",
      "  (S[+INV]\n",
      "    (V[+AUX] do)\n",
      "    (NP[-WH] you)\n",
      "    (VP[] (V[-AUX, SUBCAT='intrans'] sing))))\n"
     ]
    }
   ],
   "source": [
    "#此外，还允许没有wh结构的倒装句\n",
    "tokens ='rarely do you sing'.split()\n",
    "for tree in cp.parse(tokens):\n",
    "    print(tree)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
